Some additional data wrangling. Making one final time-series dataset which includes only the covid-related features, all time-series observations with the associated coordinates and class designations from the PCA.

no_covid <- read.csv("E:\\ProjectData\\unacast\\post-pca.csv")
covid <- read.csv("E:\\ProjectData\\unacast\\covid_ts.csv")
dims <- no_covid %>% dplyr::select(FIPS, Dim.1,Dim.2,class2,polyname)
ts_data <- inner_join(covid,dims,by = 'FIPS') %>% dplyr::select(-X)
ts_data$date <- as.Date(ts_data$date)

maps::county.fips %>%
  as_tibble %>% 
  extract(polyname, c("region", "subregion"), "^([^,]+),([^,]+)$") -> dfips
map_data("county") %>% 
  left_join(dfips) -> data
## Joining, by = c("region", "subregion")

Cool looking, but not very helpful…

plot_ly(dims, x = ~Dim.1, y = ~Dim.2, z = ~no_covid$Median.Household.Income, 
        marker = list(color = ~no_covid$Median.Household.Income, colorscale = c("Viridis"), showscale = TRUE)) %>%
  add_markers()  %>%
  layout(
    title = 'Counties by PCA Coordinates',
    scene = list(xaxis = list(title = 'Dim 1'),
                 yaxis = list(title = 'Dim 2'),
                 zaxis = list(title = 'MHHI')))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Pre-Covid Models

Fitting a linear regression model to see which variables are significant for predicting the metrics we care about before covid

Creating a subset based on a single day (2/24/2020)

day <- ts_data %>% filter(date == as.Date("2020-02-24"))
features <- day %>% dplyr::select(-c("FIPS","covid","date","county_name","state_code","polyname","week"))
vis_only <- features %>% dplyr::select(-c("daily_distance_diff","encounters_rate"))
dist_only <- features %>% dplyr::select(-c("daily_visitation_diff","encounters_rate"))
encounter_only <- features %>% dplyr::select(-c("daily_distance_diff","daily_visitation_diff"))

Basic LM - each of the three for 2/24/2020

#Visitation only
model_vis <- lm(daily_visitation_diff ~., data = vis_only)
summary(model_vis)
## 
## Call:
## lm(formula = daily_visitation_diff ~ ., data = vis_only)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49531 -0.03238  0.01442  0.02863  0.58661 
## 
## Coefficients: (3 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.0588474  0.0042707 -13.779  < 2e-16 ***
## cases             0.0051519  0.0039278   1.312     0.19    
## deaths                   NA         NA      NA       NA    
## death_rate               NA         NA      NA       NA    
## death_per_capita         NA         NA      NA       NA    
## Dim.1             0.0045229  0.0004901   9.228  < 2e-16 ***
## Dim.2            -0.0067339  0.0011163  -6.032 1.81e-09 ***
## class2            0.0047926  0.0008216   5.834 6.01e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06529 on 3002 degrees of freedom
## Multiple R-squared:  0.04571,    Adjusted R-squared:  0.04444 
## F-statistic: 35.95 on 4 and 3002 DF,  p-value: < 2.2e-16
#Distance only
model_dist <- lm(daily_distance_diff ~., data = dist_only)
summary(model_dist)
## 
## Call:
## lm(formula = daily_distance_diff ~ ., data = dist_only)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38959 -0.03679 -0.00357  0.03311  0.64803 
## 
## Coefficients: (3 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.0007875  0.0046599   0.169 0.865817    
## cases             0.0003569  0.0042858   0.083 0.933645    
## deaths                   NA         NA      NA       NA    
## death_rate               NA         NA      NA       NA    
## death_per_capita         NA         NA      NA       NA    
## Dim.1             0.0019696  0.0005348   3.683 0.000235 ***
## Dim.2            -0.0026702  0.0012180  -2.192 0.028440 *  
## class2            0.0021450  0.0008964   2.393 0.016784 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07124 on 3002 degrees of freedom
## Multiple R-squared:  0.007572,   Adjusted R-squared:  0.006249 
## F-statistic: 5.726 on 4 and 3002 DF,  p-value: 0.0001373
#Encounter rate only
model_encounter <- lm(encounters_rate ~., data = encounter_only)
summary(model_encounter)
## 
## Call:
## lm(formula = encounters_rate ~ ., data = encounter_only)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -689.4   -7.0    1.9   10.5 4606.3 
## 
## Coefficients: (3 not defined because of singularities)
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -36.1280     5.8083  -6.220 5.66e-10 ***
## cases            -12.0736     5.3420  -2.260   0.0239 *  
## deaths                 NA         NA      NA       NA    
## death_rate             NA         NA      NA       NA    
## death_per_capita       NA         NA      NA       NA    
## Dim.1            -10.4523     0.6666 -15.680  < 2e-16 ***
## Dim.2            -16.4602     1.5182 -10.842  < 2e-16 ***
## class2             7.9907     1.1174   7.151 1.07e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 88.79 on 3002 degrees of freedom
## Multiple R-squared:  0.1018, Adjusted R-squared:  0.1007 
## F-statistic:  85.1 on 4 and 3002 DF,  p-value: < 2.2e-16

Creating a subset based on a single day (3/08/2020)

day <- ts_data %>% filter(date == as.Date("2020-03-08"))
features <- day %>% dplyr::select(-c("FIPS","covid","date","county_name","state_code","polyname","week"))
vis_only <- features %>% dplyr::select(-c("daily_distance_diff","encounters_rate"))
dist_only <- features %>% dplyr::select(-c("daily_visitation_diff","encounters_rate"))
encounter_only <- features %>% dplyr::select(-c("daily_distance_diff","daily_visitation_diff"))

Basic LM - each of the three for 3/08/2020

#Visitation only
model_vis <- lm(daily_visitation_diff ~., data = vis_only)
summary(model_vis)
## 
## Call:
## lm(formula = daily_visitation_diff ~ ., data = vis_only)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54732 -0.01780 -0.00307  0.02298  0.52668 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.716e-02  4.904e-03  -3.498 0.000475 ***
## cases             9.774e-05  7.521e-04   0.130 0.896610    
## deaths            4.482e-03  7.041e-03   0.637 0.524496    
## death_rate        1.212e-01  1.106e-01   1.095 0.273476    
## death_per_capita -2.132e+04  1.196e+04  -1.783 0.074711 .  
## Dim.1             5.984e-03  5.825e-04  10.273  < 2e-16 ***
## Dim.2            -4.005e-03  1.282e-03  -3.124 0.001798 ** 
## class2            2.150e-03  9.432e-04   2.280 0.022686 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07495 on 2999 degrees of freedom
## Multiple R-squared:  0.04543,    Adjusted R-squared:  0.0432 
## F-statistic: 20.39 on 7 and 2999 DF,  p-value: < 2.2e-16
#Distance only
model_dist <- lm(daily_distance_diff ~., data = dist_only)
summary(model_dist)
## 
## Call:
## lm(formula = daily_distance_diff ~ ., data = dist_only)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44060 -0.04789  0.00305  0.04832  0.99506 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.691e-03  6.123e-03   0.603   0.5466    
## cases            -2.413e-03  9.390e-04  -2.570   0.0102 *  
## deaths            7.915e-03  8.791e-03   0.900   0.3680    
## death_rate        1.364e-01  1.381e-01   0.987   0.3235    
## death_per_capita -1.362e+04  1.493e+04  -0.912   0.3619    
## Dim.1            -1.393e-03  7.273e-04  -1.915   0.0556 .  
## Dim.2             1.749e-03  1.600e-03   1.093   0.2745    
## class2           -4.925e-03  1.178e-03  -4.182 2.97e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09358 on 2999 degrees of freedom
## Multiple R-squared:  0.01364,    Adjusted R-squared:  0.01134 
## F-statistic: 5.925 on 7 and 2999 DF,  p-value: 7.339e-07
#Encounter rate only
model_encounter <- lm(encounters_rate ~., data = encounter_only)
summary(model_encounter)
## 
## Call:
## lm(formula = encounters_rate ~ ., data = encounter_only)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -252.60   -2.74    0.62    3.67 1440.94 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.088e+01  1.891e+00  -5.753 9.67e-09 ***
## cases            -1.272e+00  2.899e-01  -4.388 1.18e-05 ***
## deaths            1.971e+00  2.714e+00   0.726    0.468    
## death_rate       -3.667e+00  4.264e+01  -0.086    0.931    
## death_per_capita  1.165e+05  4.610e+06   0.025    0.980    
## Dim.1            -4.417e+00  2.246e-01 -19.668  < 2e-16 ***
## Dim.2            -5.651e+00  4.941e-01 -11.438  < 2e-16 ***
## class2            2.509e+00  3.636e-01   6.900 6.33e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.89 on 2999 degrees of freedom
## Multiple R-squared:  0.1417, Adjusted R-squared:  0.1396 
## F-statistic:  70.7 on 7 and 2999 DF,  p-value: < 2.2e-16

Post-Covid Models

Here I fitted the same linear regression on each of the three adherence metrics to see which variables are significant for two randomly selected dates post covid…

Creating a subset based on a single day (4/1/2020)

day <- ts_data %>% filter(date == as.Date("2020-04-01"))
features <- day %>% dplyr::select(-c("FIPS","covid","date","county_name","state_code","polyname","week"))
vis_only <- features %>% dplyr::select(-c("daily_distance_diff","encounters_rate"))
dist_only <- features %>% dplyr::select(-c("daily_visitation_diff","encounters_rate"))
encounter_only <- features %>% dplyr::select(-c("daily_distance_diff","daily_visitation_diff"))

Basic LM - each of the three for 4/01/2020

#Visitation only
model_vis <- lm(daily_visitation_diff ~., data = vis_only)
summary(model_vis)
## 
## Call:
## lm(formula = daily_visitation_diff ~ ., data = vis_only)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7256 -0.1369 -0.0087  0.1546  0.8489 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4.986e-01  1.245e-02 -40.051   <2e-16 ***
## cases             2.917e-05  1.324e-05   2.203   0.0277 *  
## deaths            9.861e-04  7.870e-04   1.253   0.2103    
## death_rate       -6.764e-02  5.789e-02  -1.169   0.2427    
## death_per_capita -2.902e+02  1.763e+02  -1.646   0.0998 .  
## Dim.1             5.668e-02  1.622e-03  34.942   <2e-16 ***
## Dim.2            -5.340e-02  3.288e-03 -16.241   <2e-16 ***
## class2            4.168e-02  2.391e-03  17.431   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1894 on 2999 degrees of freedom
## Multiple R-squared:  0.4086, Adjusted R-squared:  0.4073 
## F-statistic: 296.1 on 7 and 2999 DF,  p-value: < 2.2e-16
#Distance only
model_dist <- lm(daily_distance_diff ~., data = dist_only)
summary(model_dist)
## 
## Call:
## lm(formula = daily_distance_diff ~ ., data = dist_only)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53484 -0.07976 -0.00942  0.06862  0.70076 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.886e-01  9.030e-03 -31.963  < 2e-16 ***
## cases            -1.037e-05  9.606e-06  -1.080   0.2804    
## deaths           -5.911e-04  5.709e-04  -1.035   0.3006    
## death_rate        6.604e-02  4.199e-02   1.573   0.1159    
## death_per_capita -2.700e+02  1.279e+02  -2.111   0.0348 *  
## Dim.1             2.156e-02  1.177e-03  18.319  < 2e-16 ***
## Dim.2            -2.199e-02  2.385e-03  -9.219  < 2e-16 ***
## class2            7.915e-03  1.735e-03   4.564 5.23e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1374 on 2999 degrees of freedom
## Multiple R-squared:  0.1819, Adjusted R-squared:   0.18 
## F-statistic: 95.28 on 7 and 2999 DF,  p-value: < 2.2e-16
#Encounter rate only
model_encounter <- lm(encounters_rate ~., data = encounter_only)
summary(model_encounter)
## 
## Call:
## lm(formula = encounters_rate ~ ., data = encounter_only)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.970  -0.307   0.023   0.264  62.130 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.065e+00  1.164e-01  -9.150  < 2e-16 ***
## cases            -1.471e-04  1.238e-04  -1.189    0.235    
## deaths           -3.681e-02  7.358e-03  -5.003 5.97e-07 ***
## death_rate       -3.790e-01  5.412e-01  -0.700    0.484    
## death_per_capita  1.643e+03  1.648e+03   0.997    0.319    
## Dim.1            -4.161e-01  1.517e-02 -27.434  < 2e-16 ***
## Dim.2            -3.402e-01  3.074e-02 -11.068  < 2e-16 ***
## class2            1.025e-01  2.235e-02   4.587 4.69e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.771 on 2999 degrees of freedom
## Multiple R-squared:  0.2378, Adjusted R-squared:  0.236 
## F-statistic: 133.6 on 7 and 2999 DF,  p-value: < 2.2e-16

Creating a subset based on a single day (5/17/2020)

day <- ts_data %>% filter(date == as.Date("2020-05-17"))
features <- day %>% dplyr::select(-c("FIPS","covid","date","county_name","state_code","polyname","week"))
vis_only <- features %>% dplyr::select(-c("daily_distance_diff","encounters_rate"))
dist_only <- features %>% dplyr::select(-c("daily_visitation_diff","encounters_rate"))
encounter_only <- features %>% dplyr::select(-c("daily_distance_diff","daily_visitation_diff"))

Basic LM - each of the three for 5/17/2020

#Visitation only
model_vis <- lm(daily_visitation_diff ~., data = vis_only)
summary(model_vis)
## 
## Call:
## lm(formula = daily_visitation_diff ~ ., data = vis_only)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.89200 -0.09174 -0.01790  0.07898  1.23437 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.808e-01  1.408e-02 -12.845  < 2e-16 ***
## cases             1.757e-05  5.513e-06   3.188  0.00145 ** 
## deaths           -1.279e-04  9.313e-05  -1.373  0.16986    
## death_rate        1.880e-02  7.175e-02   0.262  0.79330    
## death_per_capita -1.031e+02  2.081e+01  -4.956 7.60e-07 ***
## Dim.1             6.200e-02  1.924e-03  32.222  < 2e-16 ***
## Dim.2            -2.351e-02  3.696e-03  -6.363 2.28e-10 ***
## class2            1.927e-02  2.655e-03   7.256 5.07e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2068 on 2999 degrees of freedom
## Multiple R-squared:  0.3611, Adjusted R-squared:  0.3596 
## F-statistic: 242.2 on 7 and 2999 DF,  p-value: < 2.2e-16
#Distance only
model_dist <- lm(daily_distance_diff ~., data = dist_only)
summary(model_dist)
## 
## Call:
## lm(formula = daily_distance_diff ~ ., data = dist_only)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52203 -0.09451 -0.01384  0.06880  1.75304 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.573e-01  1.110e-02 -14.176  < 2e-16 ***
## cases             9.364e-06  4.346e-06   2.155 0.031268 *  
## deaths           -1.043e-04  7.341e-05  -1.421 0.155503    
## death_rate       -1.368e-01  5.656e-02  -2.418 0.015648 *  
## death_per_capita -3.994e+01  1.641e+01  -2.434 0.014971 *  
## Dim.1             2.984e-02  1.517e-03  19.675  < 2e-16 ***
## Dim.2            -9.262e-03  2.913e-03  -3.179 0.001490 ** 
## class2            7.740e-03  2.093e-03   3.698 0.000221 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.163 on 2999 degrees of freedom
## Multiple R-squared:  0.1812, Adjusted R-squared:  0.1793 
## F-statistic: 94.79 on 7 and 2999 DF,  p-value: < 2.2e-16
#Encounter rate only
model_encounter <- lm(encounters_rate ~., data = encounter_only)
summary(model_encounter)
## 
## Call:
## lm(formula = encounters_rate ~ ., data = encounter_only)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.154  -0.723   0.044   0.651 138.405 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.189e+00  2.829e-01  -4.204 2.70e-05 ***
## cases            -6.744e-04  1.108e-04  -6.087 1.29e-09 ***
## deaths            4.814e-03  1.871e-03   2.572   0.0101 *  
## death_rate       -3.214e+00  1.442e+00  -2.229   0.0259 *  
## death_per_capita  3.337e+02  4.182e+02   0.798   0.4250    
## Dim.1            -1.038e+00  3.866e-02 -26.860  < 2e-16 ***
## Dim.2            -8.405e-01  7.426e-02 -11.318  < 2e-16 ***
## class2            2.788e-01  5.335e-02   5.226 1.85e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.155 on 2999 degrees of freedom
## Multiple R-squared:  0.2291, Adjusted R-squared:  0.2273 
## F-statistic: 127.3 on 7 and 2999 DF,  p-value: < 2.2e-16

What are the distributions of the response variables for each of the days?

Here, it looks like I may have 3 different distributions of the response variables, this suggests that I may need to consider different formula families for the GLM to model them properly. It may also be the case that they are all poisson distributed.

#visualizing the distribution of the response variables to help decide the distribution

#Normal-ish? Higher lambda
vis_only_nontrivial <- vis_only %>% filter(daily_visitation_diff != 0) 
vis_only_nontrivial %>% ggplot(aes(x = daily_visitation_diff)) + geom_histogram(bins = 50)

#Skew right - Lower lambda
dist_only_nontrivial <- dist_only %>% filter(daily_distance_diff != 0) 
dist_only_nontrivial %>% ggplot(aes(x = daily_distance_diff)) + geom_histogram(bins = 50)

#Very skew right - very low lambda
encounter_only_nontrivial <- encounter_only %>% filter(encounters_rate != 0) 
encounter_only_nontrivial %>% ggplot(aes(x = encounters_rate)) + geom_histogram(bins = 50)

Turns out it isnt poisson, I should have caught the negative values… So, I ran this, and it looks logistic.

dist <- descdist(vis_only$daily_visitation_diff,  discrete = FALSE)

ft <- fitdist(vis_only$daily_visitation_diff, distr = "logis" )
ft
## Fitting of the distribution ' logis ' by maximum likelihood 
## Parameters:
##             estimate  Std. Error
## location -0.07818859 0.004320910
## scale     0.13821661 0.002148809
denscomp(ft)

This still doesnt work…

# 
# vis_glm <- glm(daily_visitation_diff~., family=gaussian("log"), data= vis_only)
# summary(vis_glm)